In Class Exercise 6

Social Network

Huang Anni (Singapore Management University)
06-11-2022

1. Overview

In this take-home exercise, we reveal the patterns of life in Ohio, USA by creating data visualization with tmap.

With reference to point 2 in Challenge 1 of VAST Challenge 2022, the following questions will be addressed:

Consider the social activities in the community.

2. Set up

Before we start to draw graphs, there are some work to do:

Install and import packages

To draw social network plots, I use igraph.

devtools::install_github("itsleeds/od", build_vignettes = TRUE)
packages = c('ggiraph', 'plotly', 'tidyverse', 'DT','gganimate',
             'knitr', 'ggdist', 'scales', 'grid', 'gridExtra',
             'patchwork','ggsignif','gghighlight',"hrbrthemes",
             'readxl', 'gifski', 'gapminder','treemap', 'treemapify',
             'rPackedBar','ggridges','rmarkdown','crosstalk',
             'd3scatter','tidycensus','timetk','ggseas','lubridate',
             'ggrepel','doSNOW','data.table','ViSiElse','sf','tmap',
             'clock','dplyr','od','igraph', 'tidygraph', 'ggstatsplot',
             'ggraph', 'visNetwork', 'lubridate', 'clock',
             'tidyverse', 'graphlayouts','FunnelPlotR', 'plotly', 'knitr',
             'ggcorrplot','ggstatsplot','ggside')

for(p in packages) {
  if(!require(p, character.only = T)) {
    install.packages(p)
  }
  library(p, character.only = T)
}

Read in raw data

The data sets used in this take home exercise is from the social network journals of participants in Ohio City.

There are two data sets. One contains the nodes data and the other contains the edges (also know as link) data.

participants <- read_csv("./raw_data/Attributes/Participants.csv")
social_network <- read_csv("./raw_data/Journals/SocialNetwork.csv")
financial <- read_csv("./raw_data/Journals/FinancialJournal.csv")

Data processing

For participants:

For social graph:

For friends num:

participants$educationLevel<-factor(participants$educationLevel,ordered=TRUE,levels=c('Low','HighSchoolOrCollege',"Bachelors","Graduate"))
brks <- c(17, 20, 30, 40, 50, 60, Inf)
grps <- c('<=20', '21-30','31-40', '41-50', '51-60', '>60')
participants$Age_Group <- cut(participants$age, breaks=brks, labels = grps, right = FALSE)
brks <- c(0, 0.3, 0.5, 0.6, 1)
grps <- c('Really Sad', 'Sad','Neutral', 'Happy')
participants$Joviality_Group <- cut(participants$joviality, breaks=brks, labels = grps, right = FALSE)
income_par <- financial %>% 
  filter(category %in% c('Wage')) %>%
  group_by(participantId,month=lubridate::month(timestamp)) %>%
  summarise(wage = round(sum(amount),1)) %>%
  ungroup()%>%
  group_by(participantId) %>%
  summarise(wage = mean(wage)) %>%
  ungroup()
participants <- participants %>%
  inner_join(income_par, by = "participantId")

socialNetwork_edges <- social_network %>%
  group_by(from=participantIdFrom, to=participantIdTo) %>%
  filter(from!=to) %>%
  summarise(weight = n()) %>%
  filter(weight > 1) %>%
  ungroup()

parId_in_socialNetwork <- union(unique(socialNetwork_edges$from),unique(socialNetwork_edges$to)) %>%
  sort()
parId_in_socialNetwork <- data.frame(participantId = parId_in_socialNetwork)
socialNetwork_nodes <- participants %>% 
  inner_join(parId_in_socialNetwork, by = "participantId")
socialNetwork_nodes$id <- socialNetwork_nodes$participantId
socialNetwork_graph <- igraph::graph_from_data_frame(socialNetwork_edges, 
                                                     vertices = socialNetwork_nodes)%>%
  as_tbl_graph()

friends_num_df <- socialNetwork_edges %>%
  group_by(from) %>%
  filter(from!=to) %>%
  group_by(participantId = from) %>%
  summarise(friends_num = n()) %>%
  ungroup() %>%
  inner_join(participants, by = "participantId")

participants <- participants %>%
  inner_join(friends_num_df[c('participantId','friends_num')], by = "participantId")

connection_strength <- interaction_num_df %>%
  inner_join(friends_num_df[c('participantId','friends_num')],by='participantId') %>%
  mutate(strength = interaction_num/friends_num)

interaction_num_df <- socialNetwork_edges %>%
  group_by(participantId = from) %>%
  filter(participantId!=to) %>%
  summarise(interaction_num = sum(weight)) %>%
  ungroup() %>%
  inner_join(participants, by = "participantId")

top5_most_active<-interaction_num_df %>%
  arrange(desc(interaction_num)) %>%
  slice(1:5)
top5_most_active$id <- top5_most_active$participantId
top5_most_active_nodes <- top5_most_active 
top5_most_active_edges <- social_network %>%
  group_by(from=participantIdFrom, to=participantIdTo) %>%
  filter((from!=to)&
         (from %in% top5_most_active$id)) %>%
  summarise(weight = n()) %>%
  filter(weight > 1) %>%
  ungroup()
parId_in_socialNetwork <- union(unique(top5_most_active_edges$from),unique(top5_most_active_edges$to)) %>%
  sort()
parId_in_socialNetwork <- data.frame(participantId = parId_in_socialNetwork)
top5_most_active_nodes <- participants %>% 
  inner_join(parId_in_socialNetwork, by = "participantId")
top5_most_active_nodes$id <- top5_most_active_nodes$participantId

top5_most_active_graph <- igraph::graph_from_data_frame(top5_most_active_edges, 
                                                     vertices = top5_most_active_nodes)%>%
  as_tbl_graph()

edges_weekday <- social_network %>%
  filter (lubridate::month(timestamp)==1) %>%
  slice_sample(n=100000) %>%
  group_by(from=participantIdFrom, to=participantIdTo, weekday = lubridate::wday(timestamp)) %>%
  filter((from!=to)) %>%
  summarise(weight = n()) %>%
  filter(weight > 1) %>%
  ungroup()
friends_num_df_weekday <- edges_weekday %>%
  group_by(from,weekday) %>%
  filter(from!=to) %>%
  summarise(friends_num = n()) %>%
  ungroup() %>%
  inner_join(participants[c('age','educationLevel','haveKids','participantId','Age_Group','joviality','Joviality_Group','wage','householdSize')], by = c("from"="participantId"))
parId_in_socialNetwork <- union(unique(edges_weekday$from),unique(edges_weekday$to)) %>%
  sort()
parId_in_socialNetwork <- data.frame(participantId = parId_in_socialNetwork)
weekday_nodes <- participants %>% 
  inner_join(parId_in_socialNetwork, by = "participantId")
weekday_nodes$id <- weekday_nodes$participantId
weekday_graph <- igraph::graph_from_data_frame(edges_weekday, 
                                                     vertices = weekday_nodes)%>%
  as_tbl_graph()

rich_participants <- participants %>%
  arrange(desc(wage))  %>%
  slice(1:5)
rich_participants_edges <- social_network %>%
  group_by(from=participantIdFrom, to=participantIdTo) %>%
  filter((from!=to)&
         (from %in% rich_participants$participantId)) %>%
  summarise(weight = n()) %>%
  filter(weight > 1) %>%
  ungroup()
parId_in_socialNetwork <- union(unique(rich_participants_edges$from),unique(rich_participants_edges$to)) %>%
  sort()
parId_in_socialNetwork <- data.frame(participantId = parId_in_socialNetwork)
rich_participants_nodes <- participants %>% 
  inner_join(parId_in_socialNetwork, by = "participantId")
rich_participants_nodes$id <- rich_participants_nodes$participantId

rich_participants_graph <- igraph::graph_from_data_frame(rich_participants_edges, 
                                                     vertices = rich_participants_nodes)%>%
  as_tbl_graph()

Save the processed data

write_rds(edges_weekday, './data/edges_weekday.rds')
write_rds(friends_num_df_weekday,'./data/friends_num_df_weekday.rds')
write_rds(weekday_nodes, './data/nodes_weekday.rds')
write_rds(weekday_graph, './data/graph_weekday.rds')
write_rds(top5_most_active_graph, './data/top5_most_active_graph.rds')
write_rds(top5_most_active_nodes, './data/top5_most_active_nodes.rds')
write_rds(top5_most_active_edges, './data/top5_most_active_edges.rds')
write_rds(interaction_num_df,'./data/interaction_num.rds')
write_rds(socialNetwork_graph,'./data/socialNetwork_graph.rds')
write_rds(socialNetwork_nodes,'./data/socialNetwork_nodes.rds')
write_rds(socialNetwork_edges,'./data/socialNetwork_edges.rds')
write_rds(friends_num_df,'./data/friends_num.rds')
write_rds(connection_strength,'./data/connection_strength.rds')
write_rds(rich_participants_graph,"./data/rich_participants_graph.rds")

3. Analyze social network data

Read in processed data

rich_participants_graph <- read_csv("./data/rich_participants_graph.rds")
edges_weekday <- read_rds('./data/edges_weekday.rds')
weekday_nodes <- read_rds('./data/nodes_weekday.rds')
weekday_graph <- read_rds('./data/graph_weekday.rds')
friends_num_df_weekday <- read_rds('./data/friends_num_df_weekday.rds')
top5_most_active_graph <- read_rds('./data/top5_most_active_graph.rds')
top5_most_active_nodes <- read_rds('./data/top5_most_active_nodes.rds')
top5_most_active_edges <- read_rds('./data/top5_most_active_edges.rds')
interaction_num_df <- read_rds('./data/interaction_num.rds')
social_network <- read_csv("./raw_data/Journals/SocialNetwork.csv")
socialNetwork_graph <- read_rds('./data/socialNetwork_graph.rds')
socialNetwork_nodes <- read_rds('./data/socialNetwork_nodes.rds')
socialNetwork_edges <- read_rds('./data/socialNetwork_edges.rds')
friends_num_df <- read_rds('./data/friends_num.rds')
connection_strength <- read_rds('./data/connection_strength.rds')

Is there any relationship with people’s education level and social interaction times?

Suppose the first thing we want to inspect is the distribution of the number of social interactions for participants of different education levels. We also want to know if the mean differences in the number of social interaction between different education levels is statistically significant.

I apply ANOVA test to see if there’s any relationship between social interaction tims and people’s education level. We can see that there’s a huge difference between the median social interaction times within different groups. As we can see the median social activeness is positively correlated with degree level. People with higher degree is more active.

set.seed(1234)
myvars <- c("participantId","educationLevel", "interaction_num")
newdata <- interaction_num_df[myvars]
ggbetweenstats(
  data = newdata,
  outlier.tagging = TRUE, ## whether outliers should be flagged
  outlier.label = participantId, ## label to attach to outlier values
  outlier.label.args = list(color = "red"), ## outlier point label color
  ## turn off messages
  ggtheme = ggplot2::theme_gray(), ## a different theme
  package = "yarrr", ## package from which color palette is to be take
  palette = "info2", ## choosing a different color palette
  title = "Fig1. Comparison of social activeness in different education level",
  caption = "Source: VAST Challenge",
  x = educationLevel,
  y = interaction_num,
  type = "robust", ## type of statistics
  xlab = "Education Level", ## label for the x-axis
  ylab = "Social Interactions", ## label for the y-axis
  plot.type = "boxviolin", ## type of plot
)

Correlation analysis for people’s social status

To analyze people’s social health, I use two indicators:

According to the correlation plot, we can find that:

p<-ggcorrmat(
  data     = connection_strength[c('strength',"age",'joviality','wage','friends_num')],
  colors   = c("#B2182B", "white", "#4D4D4D"),
  title    = "Fig2. Correlalogram for social network dataset",
  subtitle = "Friends num:Joviality; Friends num:strength"
)
ggplotly(p)

The top-5 most active people with different Joviality Status

As we can see, people who are happy has a strong connection with each other.

ggraph(top5_most_active_graph,
       layout = "fr") + # random
  geom_edge_link(aes(width=weight,alpha=0.2)) +
  geom_node_point(aes(color=Joviality_Group, 
                      size = 0.3)) +
  labs(title = "Fig3: The top-5 most active people with different Joviality Status",
       subtitle = 'People who are happy has a strong connection with each other') +
  theme_void() +# remove gray background +
  facet_nodes(~Joviality_Group) 

People’s social network in different age groups

I also draw the graph for people’ social network on different weekdays, but there’s no significant change. We can seee that people in 40-50 are most socially active while people older than 60 are the least.

ggraph(top5_most_active_graph,
       layout = "nicely") + # random
  geom_edge_link(aes(width=weight,alpha=0.2)) +
  geom_node_point(aes(color=friends_num, 
                      size = 0.1)) +
  labs(title = "Fig4: People's social network in different age groups",
       subtitle = 'People in 40-50 are most socially active \n while people older than 60  are the least') +
  theme_void() +# remove gray background +
  facet_nodes(~Age_Group)

Rich people have less friends?

We can see that rich people have less friends.

library(ggside)
library(ggstatsplot)
library(dplyr, warn.conflicts = FALSE)
ggscatterstats(
  data            = friends_num_df,
  x               = wage,
  y               = friends_num,
  type            = "bayes",
  xlab            = "wage",
  ylab            = "friends num",
  title = "Fig5. Relationship of friends number and wages",
)

But the strength of connection is stronger.

library(ggside)
library(ggstatsplot)
library(dplyr, warn.conflicts = FALSE)
ggscatterstats(
  data            = connection_strength,
  x               = wage,
  y               = strength,
  type            = "bayes",
  xlab            = "wage",
  ylab            = "strength of connections",
  title = "Fig6. Relationship between strength of connections and wages",
)